In the aforementioned paper, the data of three different settings are considered. The corresponding data are are gathered in four different files, corresponding to different Figures in the paper:
TABLE_S1_COM1.xlsx: Figure 1TABLE_S2_COM2.xlsx: Figure 2-6TABLE_S3_COM3.xlsx: Figure 7detailed_TABLE_S3.csv: Figure 8, 9 and 10The file detailed_TABLE_S3.csv gathers data from the
same study as the TABLE_S3_COM3.xlsx but with a different
resolution. In the file TABLE_S3_COM3.xlsx, the number of
hits of each detected species is aggregated (i.e. summed) over 4
modalities (A to D), irrespective of the
sample itself (i.e. the berry or the insect). In
data_sour_rot.xlsx, the number of hits is given for each
species and each sample (and not aggregated over modalities),
but only the samples of the modalities B, C
and D are considered.
The code to reproduce the Figures of the paper is provided here. In addition to the code, an interactive version of the plots is provided, so that details of the plots can be better explored.
Throughout the paper, the color used to display genera and orders of the microbes detected are separated in three categories:
In general, the genera and orders displayed in Figure 1-6 and 10 are consistent: the same colors display the same genera or orders for all the Figures, and the colors of the orders correspond to the color of one of the genus within this order. This consistency is only approximate for the Figure 7.
The Figures 1-7 and 10 are saved in three different formats:
.png for a quick view, .eps for the journal
and .html for an interactive version, allowing for zooming
and details about the displayed values.
We first load the data described in the first Table and do some basic processing, and in particular, we extract the name of the species.
dfExp1 <- read_excel(path = "Data/TABLE_S1_COM1.xlsx", skip = 1)
dfExp1 <- dfExp1 |>
# selects interesting columns
select(`Sequences 16S or ITS named after verification (in LPSN for bacteria and Mycobank/Index Fungorum for fungi) of the name(s) obtained by sequence similarity searches in GenBank.`, A, B, C, D, `Order; family (after GenBank)`) |>
# changes the way hits are stored; instead, on row corresponds to one pair hit/category
pivot_longer(cols = c(A, B, C, D), names_to = "Modality", values_to = "hits") |>
mutate(Modality = factor(Modality)) |>
# rename unconvinent columns
rename(Species =`Sequences 16S or ITS named after verification (in LPSN for bacteria and Mycobank/Index Fungorum for fungi) of the name(s) obtained by sequence similarity searches in GenBank.`,
OrderFamily = `Order; family (after GenBank)`) |>
# gets the order and the family
separate_wider_delim(OrderFamily, delim = ";", names = c("Order", "Family")) |>
# removes the unnecessary column
mutate(OrderFamily = NULL) |>
# removes some initial spaces and brackets
mutate(Genus = gsub(" .*$", "", x = Species) |> gsub("\\[|\\]", "", x=_)) |>
# changes unconvential spaces into normal ones
mutate(across(.cols = c(Genus, Family, Species, Order), .fns = ~ gsub("\\p{Zs}", " ", x =., perl = TRUE))) |>
# removes spaces from the columns Genus, Family and Order
mutate( across(.cols = c(Genus, Family, Order), .fns = ~ gsub(" ", "", x =.)))
We then rename the modalities (see the legend in the first raw of the
file TABLE S1_COM1.xlsx).
levels(dfExp1$Modality) <- list("Sour rot Grisons" = "A",
"Sour rot Vaud" = "B",
"Sour rot Chasselas Nyon" = "C",
"Grey mould Chasselas Vaud" = "D")
We then load the second dataset.
### Study 2, Figure 2-7
dfExp2 <- read_excel(path = "Data/TABLE_S2_COM2.xlsx", skip = 1)
## removing the raw containing totals
ind_remove <- dfExp2 |> pull(`16S or ITS sequences named based on GenBank BLAST top score(s) after verification of the taxon current names in taxonomic databases (LPSN for bacteria and Mycobank/Index Fungorum for fungi)`) |> grep(pattern = "otal", x =_)
dfExp2 <- dfExp2[-ind_remove,]
# get the columns starting with a capital letter followed by a space (which correspond to the hits)
def_var <- grep("^[A-Z]\\s", names(dfExp2), value = TRUE)
dfExp2 <- dfExp2 |> select(`16S or ITS sequences named based on GenBank BLAST top score(s) after verification of the taxon current names in taxonomic databases (LPSN for bacteria and Mycobank/Index Fungorum for fungi)`,
all_of(def_var),`Order; family (after GenBank)`) |>
# renames the columns corresponding to the studies by the initial capital letter (which characterizes these columns)
rename_with(.fn = ~ str_sub(.x, start = 1L, end = 1L),.cols = matches("^[A-Z]\\s")) |>
# transform the dataset in tidy version
pivot_longer(cols = LETTERS[1:length(def_var)], names_to = "Experiment", values_to = "hits")
In the following, we rename the different modalities and create an
object df_categories that will be used as a lookup table
for the names of the categories.
# Replacement of multiple space with underscores
def_var <- gsub(" +", "_", def_var)
# Splitting and turn into data.frame
df_categories <- do.call(rbind, strsplit(def_var, "_")) |> as.data.frame(stringsAsFactors = TRUE)
# Renaming the columns
names(df_categories) <- c("Experiment", "State", "Protection", "Week")
# creating the explicit labels
lev_state <- c("HA" = "Healthy berries adj.", "UW" = "Unwounded", "W" = "Wounded")
lev_protect <- c("P" = "Protected", "UP" = "Unprotected")
df_categories <- df_categories |>
mutate(StateLabel = plyr::mapvalues(State, from = names(lev_state), to = lev_state),
ProtectionLabel = plyr::mapvalues(Protection, from = names(lev_protect), to = lev_protect))
dfExp2 <- dfExp2 |>
left_join(y = df_categories, by = "Experiment") |>
# rename unconvinent columns
rename(Species =`16S or ITS sequences named based on GenBank BLAST top score(s) after verification of the taxon current names in taxonomic databases (LPSN for bacteria and Mycobank/Index Fungorum for fungi)`,
OrderFamily = `Order; family (after GenBank)`) |>
# gets the order and the family
separate_wider_delim(OrderFamily, delim = "; ", names = c("Order", "Family")) |>
mutate( Family = factor(Family, levels = sort(unique(Family))),Order = factor(Order, levels = sort(unique(Order))) ) |>
# removes the unnecessary column
mutate(OrderFamily = NULL) |>
# removes some initial spaces and brackets
mutate(Genus = gsub(" .*$", "", x = Species) |> gsub("\\[|\\]", "", x=_)) |>
# removes unnecessary (i.e. with no hits) rows
filter(! is.na(hits))
We can now create the color palette that will be used for the Figures
1-7 and 10. To make sure that it is consistent throughout the
experiments 1 and 2 (it is only approximately consistent for the third
experiment), we need to first have loaded the data so that any genera
appearing has a single color assigned. The detailed code is available in
the file creating_palette_colors.R. Here, we only source
it.
### Creating the color palette
source(file = "creating_palette_colors.R") # creates the object df_col, a data frame containing the colors associated to each Genera and Order appearing in Studies 1 and 2 (Table S1 and S2).
To have a consistent aspect of the Figures, we also need to ensure the same order in which the genera and orders are displayed in the Figures.
dfExp1$Genus <- factor(dfExp1$Genus, levels = levels(df_col$Genus)) |> droplevels()
dfExp1$Order <- factor(dfExp1$Order, levels = levels(df_col$Order)) |> droplevels()
dfExp2$Genus <- factor(dfExp2$Genus, levels = levels(df_col$Genus)) |> droplevels()
dfExp2$Order <- factor(dfExp2$Order, levels = levels(df_col$Order)) |> droplevels()
We create some intermediary data frames that will then be used to produce the Figure 1.
dfExp1_GroupedName <- dfExp1 |>
select(Genus, Modality, hits) |>
group_by(Modality, Genus) |>
summarise(hits = sum(hits, na.rm = T))
dfExp1_GroupedName_Proportions <- dfExp1_GroupedName |> group_by(Modality) |> reframe( Genus = Genus, Proportion = hits/sum(hits, na.rm = T))
# data.frame to count the number of hits per modality
Exp1_Samp_Sizes <- dfExp1_GroupedName |> group_by(Modality) |> summarise(Size = sum(hits, na.rm = T)) |>
mutate(label = paste("n =", Size))
We can now produce the Figure 1 and save it.
pl1 <- ggplot(dfExp1_GroupedName_Proportions, mapping = aes(x = Modality, fill = Genus, y = Proportion)) +
geom_bar(stat = "identity", col = "white", width = .3) +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
scale_x_discrete(limits = c( "Grey mould Chasselas Vaud","Sour rot Chasselas Nyon", "Sour rot Grisons","Sour rot Vaud"), labels =c( "Grey mould\n Chasselas Vaud","Sour rot\nChasselas Nyon", "Sour rot Grisons","Sour rot Vaud"))+
labs(x = "", y = "Proportion", fill = "Species") +
scale_fill_manual( values =df_col$ColName, breaks = df_col$Genus ) +
geom_text(data = Exp1_Samp_Sizes,
aes(x = Modality, y = 1.05, label = label),
inherit.aes = FALSE,
vjust = 0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6),
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
pl1
The Figures 2-6 are based on the second study (essentially the object
dfExp2).
The Figure 2 displays the evolution of the relative abundance (expressed in %) of sour rot associated microorganisms in wounded and unprotected grape berries of Chasselas over a three week period after their wounding, corresponding to Experiments F, G and J in Table S2 for the classification assigned to our sequences. Genera that are not within the Saccharomycotina are gathered in a single category.
# data.frame for the Figure 2
dfExp2_Figure2 <- dfExp2 |>
left_join(y = df_col, by = c("Genus", "Order")) |>
filter(Protection == "UP", State == "W") |>
group_by(Week,type) |>
summarise(hits = sum(hits)) |>
group_by(Week) |> reframe( type = type, Proportion = hits/sum(hits, na.rm = T), hits)
# data.frame of the sample sizes
dfExp2_Figure2_Size <- dfExp2_Figure2 |> group_by(Week) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
pl2 <- ggplot(data = dfExp2_Figure2, mapping = aes(x = Week, fill = type, y = Proportion)) +
geom_bar(stat = "identity", width = .3, col = "white") +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs(x = "", y = "Proportion", fill = "Order", x = "") +
scale_fill_manual(values =df_col$Coltype, breaks = df_col$type ) +
scale_x_discrete(labels =paste("Week", 1:3)) +
geom_text(data = dfExp2_Figure2_Size,
aes(x = Week, y = 1.05, label = label),
inherit.aes = FALSE,
vjust = 0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6),
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
pl2
The Figure 3 displays the relative abundance of microorganisms isolated from unprotected grape clusters of Chasselas three weeks after the start of the experiment, corresponding to Experiments F, G and J in Table S2 for the classification assigned to our sequences.
dfExp2_Figure3 <- dfExp2 |>
filter(Protection == "UP", Week == 3) |>
group_by(State,Order) |>
summarise(hits = sum(hits)) |>
group_by(State) |> reframe(Order = Order, Proportion = hits/sum(hits, na.rm = T), hits)
dfExp2_Figure3_Size <- dfExp2_Figure3 |> group_by(State) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
pl3 <- ggplot(data = dfExp2_Figure3, mapping = aes(x = State, fill = Order, y = Proportion)) +
geom_bar(stat = "identity", width = .3, col = "white") +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs( y = "Proportion", fill = "Order", x = "") +
scale_fill_manual( values =df_col$ColOrder, breaks = df_col$Order ) +
scale_x_discrete(limits = c("W", "HA", "UW"), labels = c("Wounded\nsymptomatic\nberries" ,"Unwounded\nadjacent\nberries", "Unwounded\nberries") ) +
geom_text(data = dfExp2_Figure3_Size,
aes(x = State, y = 1.05, label = label),
inherit.aes = FALSE,
vjust = -0.0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6),
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
pl3
We then show the Figure 4, which displays the evolution over three weeks of the microbial community inferred from the relative abundance (expressed in %) of fungal and bacterial orders present on the surface of unwounded berries from grape clusters of cv. Chasselas protected or not by a net in the vineyard plot of Nyon (see Table S2 for the classification of isolates at order rank), corresponding to Experiments H to L in Table S2 for the classification assigned to our sequences.
dfExp2_Figure4 <- dfExp2 |>
filter(State == "UW") |>
mutate(Week = paste("Week ", Week)) |>
group_by(Week, Order, Protection) |>
summarise(hits = sum(hits)) |>
group_by(Week, Protection) |> reframe(Week = Week, Protection = Protection,
Order = Order, Proportion = hits/sum(hits, na.rm = T), hits)
dfExp2_Figure4_Size <- dfExp2_Figure4 |> group_by(Protection, Week) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
pl4 <- ggplot(data = dfExp2_Figure4, mapping = aes(x = Protection, fill = Order, y = Proportion)) +
facet_wrap(~ Week) +
geom_bar(stat = "identity", col = "white", width = .3) +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs( y = "Proportion", fill = "Order", x = "") +
scale_fill_manual( values =df_col$ColOrder, breaks = df_col$Order ) +
scale_x_discrete(limits = c("UP", "P"), labels = c("Netless" , "Net") ) +
geom_text(data = dfExp2_Figure4_Size,
aes(x = Protection, y = 1.05, label = label),
inherit.aes = FALSE,
vjust = -0.0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6),
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
pl4
The Figure 5 displays the evolution over three weeks of the microbial community inferred from the relative abundance of fungi and bacteria present in berries of Chasselas sampled from unprotected grape clusters of artificially wounded or intact berries in the vineyard plot of Nyon (thus excluding berries that are adjacent to symptomatic berries, corresponding to column G), corresponding to Experiments D-F and H-J in Table S2 for the classification assigned to our sequences.
dfExp2_Figure5 <- dfExp2 |>
# Only unprotected grapes
filter(Protection == "UP") |>
# Only Wounded and Unwounded categories
filter(State != "HA") |>
mutate(Week = paste("Week ", Week)) |>
mutate(State = plyr::mapvalues(State, from = c("W", "UW"), to = c("Wounded", "Unwounded")) |> droplevels()) |>
group_by(Week, Order, State) |>
summarise(hits = sum(hits)) |>
group_by(Week, State) |> reframe(Week = Week, State = State,
Order = Order, Proportion = hits/sum(hits, na.rm = T), hits)
dfExp2_Figure5_Size <- dfExp2_Figure5 |> group_by(State, Week) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
pl5 <- ggplot(data = dfExp2_Figure5, mapping = aes(x = State, fill = Order, y = Proportion)) +
facet_wrap(~ Week) +
geom_bar(stat = "identity", col = "white", width = .3) +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs( y = "Proportion", fill = "Order", x = "") +
scale_fill_manual( values =df_col$ColOrder, breaks = df_col$Order ) +
# scale_x_discrete(limits = c("UP", "P"), labels = c("Netless" , "Net") ) +
geom_text(data = dfExp2_Figure5_Size, # Utiliser le data.frame des annotations
aes(x = State, y = 1.05, label = label),
inherit.aes = FALSE,# Placer les annotations au-dessus des barres
vjust = -0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6), # Lignes principales
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()# Lignes secondaires
)
pl5
The Figures 6A and 6B display the evolution over time of the microbial community inferred from the relative abundance of fungal and bacterial orders (A) and genera (B) in berries of Chasselas grapes sampled from artificially wounded grape clusters protected or not by a net in a vineyard plot at Nyon, corresponding to Experiments A-F in Table S2 for the classification assigned to our sequences.
dfExp2_Figure6A <- dfExp2 |>
filter(State == "W") |>
mutate(Week = paste("Week ", Week)) |>
mutate(Protection = plyr::mapvalues(Protection, from = c("P", "UP"), to = c("Net", "Netless")) |> droplevels()) |>
group_by(Week, Order, Protection) |>
summarise(hits = sum(hits)) |>
group_by(Week, Protection) |> reframe(Week = Week, Protection = Protection,
Order = Order, Proportion = hits/sum(hits, na.rm = T), hits)
dfExp2_Figure6A_Size <- dfExp2_Figure6A |> group_by(Protection, Week) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
pl6A <- ggplot(data = dfExp2_Figure6A, mapping = aes(x = Protection, fill = Order, y = Proportion)) +
facet_wrap(~ Week) +
geom_bar(stat = "identity", col = "white", width = .3) +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs( y = "Proportion", fill = "Order", x = "") +
scale_fill_manual( values =df_col$ColOrder, breaks = df_col$Order ) +
geom_text(data = dfExp2_Figure6A_Size,
aes(x = Protection, y = 1.05, label = label),
inherit.aes = FALSE,
vjust = -0.0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6),
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)+
ggtitle(label = "A: Wounded berries", subtitle = "Distribution of the order among samples")
pl6A
#### Figure 6B
dfExp2_Figure6B <- dfExp2 |>
# Only unprotected grapes
# Only Wounded and Unwounded categories
filter(State != "UW") |>
mutate(Week = paste("Week ", Week)) |>
mutate(Protection = plyr::mapvalues(Protection, from = c("P", "UP"), to = c("Net", "Netless")) |> droplevels()) |>
group_by(Week, Genus, Protection) |>
summarise(hits = sum(hits)) |>
group_by(Week, Protection) |> reframe(Week = Week, Protection = Protection,
Genus = Genus, Proportion = hits/sum(hits, na.rm = T), hits)
dfExp2_Figure6B_Size <- dfExp2_Figure6B |> group_by(Protection, Week) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
pl6B <- ggplot(data = dfExp2_Figure6B, mapping = aes(x = Protection, fill = Genus, y = Proportion)) +
facet_wrap(~ Week) +
geom_bar(stat = "identity", col = "white", width = .3) +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs( y = "Proportion", fill = "Genera") +
scale_fill_manual( values =df_col$ColName, breaks = levels(dfExp2_Figure6B$Genus)) +
geom_text(data = dfExp2_Figure6A_Size,
aes(x = Protection, y = 1.05, label = label),
inherit.aes = FALSE,
vjust = -0.0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6),
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
) +
ggtitle(label = "B: Wounded berries", subtitle = "Distribution of the genera among samples")
pl6B
The third study gathers data from four categories: healthy berries post-surface sterilization (internal), the surface of healthy non-sterilized berries (external),berries affected by sour rot (sour rot) of grapes cv. Gamay and of Drosophila spp. (insects) captured in a commercial vineyard in Begnins (VD, Switzerland).
dfExp3 <- read_excel(path = "Data/TABLE_S3_COM3.xlsx", skip = 1)
## removing the raw containing totals
ind_remove <- dfExp3 |> pull(`Sequences 16S or ITS named after verification in Mycobank/Index Fungorum of the name(s) obtained by sequence similarity searches in GenBank.`) |> grep(pattern = "otal", x =_)
dfExp3 <- dfExp3[-ind_remove,]
# get names of the columns corresponding to the hits
def_var <- grep("^[A-Z]\\s", names(dfExp3), value = TRUE)
dfExp3 <- dfExp3 |> select(`Sequences 16S or ITS named after verification in Mycobank/Index Fungorum of the name(s) obtained by sequence similarity searches in GenBank.`,
all_of(def_var),`Subphylum; order; family (after Mycobank)`) |>
# rename the columns corresponding to the experiment with only the initial capital letter
rename_with(.fn = ~ str_sub(.x, start = 1L, end = 1L),.cols = matches("^[A-Z]\\s")) |>
pivot_longer(cols = LETTERS[1:length(def_var)], names_to = "Experiment", values_to = "hits")
## In the following lines, we use the name of the variables to extract the information directly from their name
# Replacement of multiple space with underscores
def_var <- gsub(" +", "_", def_var)
# transform sour_rot into sour-rot
def_var <- gsub(pattern = "sour_rot", replacement = "sour-rot", def_var)
# transform pulp_+_skin into pulp-skin
def_var <- gsub(pattern = "pulp_+_skin", replacement = "pulp-skin", def_var, fixed = T)
# Splitting and turn into data.frame which then be used also as labels for the plots
df_categories <- do.call(rbind, strsplit(def_var, "_")) |> as.data.frame(stringsAsFactors = TRUE)
# Renaming the columns
names(df_categories) <- c("Experiment", "type", "Sterilization", "Material")
df_categories <- df_categories |> mutate(type = case_when(
type == "sour-rot" ~ "sour-rot",
type =="Dros" ~ "insect",
type == "heal" & Sterilization == "S" ~ "healthy sterilized\nberries",
type == "heal" & Sterilization == "US" ~ "extern"
))
df_categories <- df_categories |> select(Experiment, type)
all_ranks <- c("Subphylum", "Order", "Family", "Genus", "Species")
dfExp3 <-
dfExp3 |>
# rename unconvinent columns
rename(Species =`Sequences 16S or ITS named after verification in Mycobank/Index Fungorum of the name(s) obtained by sequence similarity searches in GenBank.`,
SubphylumOrderFamily = `Subphylum; order; family (after Mycobank)`) |>
# gets the order and the family
separate_wider_delim(SubphylumOrderFamily, delim = ";", names = c("Subphylum", "Order", "Family"), too_few = "align_start") |>
# transform NA into 0 (what they are!)
mutate(hits = if_else(is.na(hits), false = hits, true = 0)) |>
# removes the unnecessary column
mutate(OrderFamily = NULL) |>
# removing potential brackets in the names
mutate(Species = gsub("\\[|\\]", "", x=Species)) |>
# rextract the genus (as "what is before the first space")
mutate(Genus = gsub(" .*$", "", x = Species)) |>
# detect all non trivial spaces et replace them
mutate(across(.cols = all_of(all_ranks), .fns = ~ gsub("\\p{Zs}", " ", x =., perl = TRUE))) |>
mutate( across(.cols = all_of(all_ranks[!all_ranks == "Species"]), .fns = ~ gsub(" ", "", x =.))) |>
# replace the cells containing "?" (regex: "\\?") by a NA
mutate(across(.cols = all_of(all_ranks), .fns = ~ if_else(grepl(pattern = "\\?",x=. ), true = NA, false = .) )) |>
relocate(all_of(all_ranks))
The Figure 7 displays the relative abundance of fungal orders in these four categories, corresponding to the data of Table S3.
dfExp3_Figure7 <- dfExp3 |>
mutate(across(.cols = -c(hits, Order), .fns = factor)) |>
mutate(Experiment = plyr::mapvalues(Experiment, from = df_categories$Experiment, to = df_categories$type) |> droplevels()) |>
group_by(Order, Experiment) |>
summarise(hits = sum(hits)) |>
group_by(Experiment) |> reframe(Experiment = Experiment, Order = Order, Proportion = hits/sum(hits, na.rm = T), hits) |>
# creates the category of Saccharomycotina
mutate(type = if_else(Order %in% c("Ascoideales", "Pichiales","Dipodascales", "Saccharomycodales", "Serinales", "Phaffomycetales" ),
true = "Saccharomycotina",false = "Non-Saccharomycotina")) |>
distinct() |>
arrange(type, Order, desc(is.na(Order)))
# We create the levels to have some order: first the type (Saccharomycotina or not) and the alphabetic order
lev_order <- dfExp3_Figure7 |> select(type, Order) |> distinct() |> arrange(type, Order) |> pull(Order)
dfExp3_Figure7$Order <- factor(dfExp3_Figure7$Order, levels = lev_order)
dfExp3_Figure7_Size <- dfExp3_Figure7 |> group_by(Experiment) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
We create the colors assigned to each of the genera and orders of the data set. It is only approximately consistent with the colors used in the Figures 1-6.
df_col_Exp3 <- dfExp3 |> select(Order) |> distinct() |>
mutate(type = if_else(Order %in% c("Ascoideales", "Pichiales","Dipodascales", "Saccharomycodales", "Serinales", "Phaffomycetales" ),
true = "Saccharomycotina",false = "Non-Saccharomycotina")) |>
distinct() |>
arrange(is.na(Order),type, Order )
cat_colExp3 <- table(df_col_Exp3$type)
df_col_Exp3$ColOrder <- NA
df_col_Exp3$ColOrder[df_col_Exp3$type == "Non-Saccharomycotina"] <- colorspace::divergingx_hcl(n =cat_colExp3["Non-Saccharomycotina"],palette = "Cividis")
df_col_Exp3$ColOrder[df_col_Exp3$type == "Saccharomycotina"] <-
colorspace::sequential_hcl(n =(cat_colExp3["Saccharomycotina"]) + 2,palette = "Greens")[1:cat_colExp3["Saccharomycotina"]]
df_col_Exp3$ColOrder[which(is.na(df_col_Exp3$Order))] <- "#000000"
pl7 <- ggplot(data = dfExp3_Figure7, mapping = aes(x = Experiment, fill = Order, y = Proportion)) +
geom_bar(stat = "identity", color = "white", width = .3) +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs( y = "Proportion", fill = "Order") +
scale_fill_manual( values =df_col_Exp3$ColOrder, breaks = df_col_Exp3$Order) +
# scale_x_discrete(limits = c("UP", "P"), labels = c("Netless" , "Net") ) +
geom_text(data = dfExp3_Figure7_Size, # Utiliser le data.frame des annotations
aes(x = Experiment, y = 1.05, label = label),
inherit.aes = FALSE,# Placer les annotations au-dessus des barres
vjust = -0.0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6), # Lignes principales
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()# Lignes secondaires
)
pl7
We first load and preprocess the data and as well as import the
package metacoder.
source("Load_preprocess_sour_rot.R")
library(metacoder)
lsSourRot <- LoadSourRot(normalization = FALSE)
dfSourRot <- lsSourRot$dfSourRot
vars <- lsSourRot$vars
types <- lsSourRot$types
rm(lsSourRot)
We first show the number of samples for each modality.
types_names <- c("extern" = "healthy berries", "insect" = "Drosophilia suzuki", "sour rot" = "Sour rot affected berries")
samp_sizes <- table(types)
names(samp_sizes) <- types_names
as.data.frame(samp_sizes) |> rename(Modality = Var1, "Sample size" = Freq) |> knitr::kable()
| Modality | Sample size |
|---|---|
| healthy berries | 40 |
| Drosophilia suzuki | 18 |
| Sour rot affected berries | 66 |
Then, we create an object that can be parsed by the utilities of
metacoder. It essentially follows the steps described in
the documentation
of the package.
# creates a column that contains taxonomic information
dfSourRot$lineage <- apply(data.frame(root = "r__Root",apply(dfSourRot[,c("Subphylum", "order", "family", "genus", "species" )], 1, function(z)paste(c("p__","o__", "f__", "g__", "s__" ),z, sep="") ) |> t())
,1, paste0, collapse= ";")
obj <- parse_tax_data(dfSourRot[, -c(1:5)], # we do not consider the columns corresponding to species, genus, family and order
class_cols = "lineage", # the column that contains taxonomic information
class_sep = ";", # The character used to separate taxa in the classification
class_regex = "^(.+)__(.+)$", # Regex identifying where the data for each taxon is
class_key = c(tax_rank = "info", # A key describing each regex capture group
tax_name = "taxon_name"))
First, we briefly explore the number of hits for each sample and each modality.
samp_indices <- lapply(dfSourRot, is.numeric) |> unlist()
sample_hits <- cbind.data.frame(colSums(dfSourRot[,samp_indices]), types)
names(sample_hits) <- c("hits", "type")
ggplot(sample_hits, mapping = aes(y = hits, x = type, fill = type)) +
geom_boxplot()
We then compute the necessary tables that will then be used to display the absolute counts of each ITS. This is intrisically not normalized and comparison must be done with care. Indeed, by simply sampling more in one of the group would result in an imbalance in terms of counts, which may or may not be related to a difference in the distribution of the species in the various groups.
obj$data$tax_abund <- calc_taxon_abund(obj, "tax_data", cols = vars)
obj$data$tax_occ <- calc_n_samples(obj, "tax_abund", groups = types, cols = vars)
## computes the range of occurences. There is certainly a more efficient way to do it...
range_col <- obj$data$tax_occ |>
select(-taxon_id) |>
lapply( FUN = range) |>
do.call(args = _ , rbind) |>
as.data.frame() |>
summarise(Min = min(V1), Max = max(V2)) |>
unlist()
We then plots three heat trees that reflect the abundance. In this first three plots, we use the same color scale to allow for a comparison between the various counts. Note however that this must be done with care. Note that these plots are then also saved in a pdf file.
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = extern,
title = "Surface of healthy berries",
title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
edge_color_interval = range_col,
output_file = paste0("Figures/heat_tree_extern", ext))
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = insect,
title = "Drosophilia spp.",
title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
edge_color_interval = range_col,
output_file = paste0("Figures/heat_tree_insect", ext))
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = obj$data$tax_occ$`sour rot`,
title = "Symptomatic berries",
title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
edge_color_interval = range_col,
output_file = paste0("Figures/heat_tree_sour_rot", ext))
The next three plots are the exact same, but without a common color scale. This allows one for an interpretation tailored to the group.
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = extern,
title_size = 0.04,
title = "Surface of healthy berries",
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
output_file = paste0("Figures/heat_tree_extern_ownscale", ext))
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = insect,
title = 'Drosophilia spp.',
title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
output_file = paste0("Figures/heat_tree_insect_ownscale", ext))
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = obj$data$tax_occ$`sour rot`,
title = "Symptomatic berries",
title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
output_file = paste0("Figures/heat_tree_sour_rot_ownscale", ext))
We also provide the plots without title (to be combined manually for the paper version).
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = obj$data$tax_occ$`sour rot`,
# title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
edge_color_interval = range_col,
output_file = "Figures/Figure8_A_sour_rot.pdf")
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = extern,
title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
edge_color_interval = range_col,
output_file = "Figures/Figure8_B_healthy_berries.pdf")
set.seed(1) # This makes the plot appear the same each time it is run
heat_tree(obj,
node_label = taxon_names,
node_size = n_obs,
node_color = insect,
title_size = 0.04,
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Samples with reads",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
edge_color_interval = range_col,
output_file = "Figures/Figure8_C_Drosophilia.pdf")
As already mentioned, since samples are intrisically imbalanced, a direct comparion is not well suited. We thus resort to a relative abundance: for each sample, the proportion of counts are computed for each taxon.
obj$data$its_table <- calc_obs_props(obj, data = "tax_data", cols = vars) # for proportion
obj$data$tax_table <- calc_taxon_abund(obj, data = "its_table", cols = vars)
# to compare differences between groups and then plot it
obj$data$diff_table <- compare_groups(obj, data = "tax_table", cols = vars, groups = types)
obj <- mutate_obs(obj, "diff_table",
wilcox_p_value = p.adjust(wilcox_p_value, method = "fdr"))
obj$data$diff_table$log2_median_ratio[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
obj$data$diff_table$median_diff[obj$data$diff_table$wilcox_p_value > 0.05] <- 0
Note that we have only kept the differences that were significant,
the other ones being set to 0. More precisely, we use a Mann-Whitney
test, with an adjustment for multiple testing based on the
Benjamini-Hochberg procedure (R function
p.adjust(method = "fdr")).
Now, it is possible to compare the groups pairwise in terms of proportion within each group (healthy berries, sour rot symptomatic berries and Drosophilia), which all are on the same scale, namely the interval \([0,1]\). For each edge (i.e. taxon), the median of the proportion of the corresponding taxon in the group considered is computed. Then, in the first plot that corresponds to the Figure 9 in the paper, the log (in base 2) of the ratio of the two medians (or equivalently, of the differences of the logs) is represented on the color scale, for each pair of groups. Note that, by convention, if any of the two medians is 0, then 0 is displayed. In the second plot, the difference between the two median proportion is plotted. Since the two medians both are in the interval \([0,1]\), the difference is in the interval \([-1,1]\).
A taxon colored blue is more abundant in the group in the column
while a taxon colored orange is more abundant in group of the row; see
also the color of the labels of rows and columns. See this paragraph
for implementation details and further clarifications about differential
heat trees implemented in metacoder.
custom_cols <- palette.colors()[c(6, 9, 7)] # subset of the Okabe-Ito palette
set.seed(1)
heat_tree_matrix(obj,
data = "diff_table",
node_size = n_obs, # n_obs is a function that calculates, in this case, the number of OTUs per taxon
node_label = taxon_names,
node_color = log2_median_ratio, # A column from `obj$data$diff_table`
node_color_range = custom_cols, # palette different
row_label_color = custom_cols[length(custom_cols)],
col_label_color = custom_cols[1],
node_color_trans = "area", # The default is scaled by circle area
node_color_interval = c(-3, 3), # The range of `log2_median_ratio` to display
edge_color_interval = c(-3, 3), # The range of `log2_median_ratio` to display
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Log 2 ratio median proportions",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford", # The layout algorithm that initializes node locations
output_file = paste0("Figures/relative_log2_differential_heat_tree", ext))
heat_tree_matrix(obj,
data = "diff_table",
node_size = n_obs, # n_obs is a function that calculates, in this case, the number of OTUs per taxon
node_label = taxon_names,
node_color = median_diff, # A column from `obj$data$diff_table`
node_color_range = custom_cols, # palette different
row_label_color = custom_cols[length(custom_cols)],
col_label_color = custom_cols[1],
node_color_interval = c(-0.75, 0.75), # The range of `median_ratio` to display
edge_color_interval = c(-0.75, 0.75), # The range of `median_ratio` to display
node_size_axis_label = "ITS genotypes",
node_color_axis_label = "Difference of median proportions",
layout = "davidson-harel", # The primary layout algorithm
initial_layout = "reingold-tilford",
output_file = paste0("Figures/relative_median_differential_heat_tree", ext))
We also display the resulting \(p-\)values associated with all the taxon with significant differences in the various modalities.
names_taxon <- obj$data$class_data |> select(taxon_id, tax_name, tax_rank) |> distinct()
obj$data$diff_table |>
filter(wilcox_p_value < 0.05 & !is.na(wilcox_p_value) ) |>
select(taxon_id, treatment_1, treatment_2, wilcox_p_value, median_diff) |>
filter(median_diff != 0 ) |>
left_join(names_taxon, by = "taxon_id") |>
select(- taxon_id, - tax_rank) |>
relocate(tax_name, treatment_1, treatment_2) |>
rename("Taxon name" = tax_name, "Modality 1 (row)" = treatment_1, "Modality 2 (col)" = treatment_2, "p-val." = wilcox_p_value , "median differences" = median_diff) |>
knitr::kable()
| Taxon name | Modality 1 (row) | Modality 2 (col) | p-val. | median differences |
|---|---|---|---|---|
| Saccharomycotina | extern | sour rot | 0.0000000 | -0.7333333 |
| Peziomycotina | extern | sour rot | 0.0000000 | 0.6000000 |
| Pichiales | extern | sour rot | 0.0000000 | -0.4807692 |
| Pleosporales | extern | sour rot | 0.0000000 | 0.3095238 |
| Saccharomycodales | extern | sour rot | 0.0018433 | -0.1944444 |
| Pichiaceae | extern | sour rot | 0.0000000 | -0.4807692 |
| Pleosporaceae | extern | sour rot | 0.0000000 | 0.2678571 |
| Saccharomycodaceae | extern | sour rot | 0.0018433 | -0.1944444 |
| Pichia | extern | sour rot | 0.0000000 | -0.4807692 |
| Alternaria | extern | sour rot | 0.0000000 | 0.2500000 |
| Hanseniaspora | extern | sour rot | 0.0018433 | -0.1944444 |
| Alternaria sp. 1 | extern | sour rot | 0.0000000 | 0.2500000 |
| Saccharomycotina | extern | insect | 0.0008015 | -0.5718750 |
| Peziomycotina | extern | insect | 0.0003303 | 0.4819444 |
| Dipodascales | extern | insect | 0.0002991 | -0.0277778 |
| Eurotiales | extern | insect | 0.0009919 | -0.0555556 |
| Pichiales | extern | insect | 0.0025338 | -0.1885965 |
| Pleosporales | extern | insect | 0.0000473 | 0.3095238 |
| Aspergillaceae | extern | insect | 0.0009919 | -0.0555556 |
| Pichiaceae | extern | insect | 0.0025338 | -0.1885965 |
| Pleosporaceae | extern | insect | 0.0000422 | 0.2678571 |
| Penicillium | extern | insect | 0.0002148 | -0.0555556 |
| Pichia | extern | insect | 0.0025338 | -0.1885965 |
| Alternaria | extern | insect | 0.0000422 | 0.2500000 |
| Alternaria sp. 1 | extern | insect | 0.0000422 | 0.2500000 |
| Hanseniaspora uvarum gen 8 | extern | insect | 0.0063806 | -0.0900735 |
| Dipodascales | sour rot | insect | 0.0446839 | -0.0277778 |
| Eurotiales | sour rot | insect | 0.0001336 | -0.0555556 |
| Aspergillaceae | sour rot | insect | 0.0001336 | -0.0555556 |
| Penicillium | sour rot | insect | 0.0000019 | -0.0555556 |
Note that the published version of the Figure 9 is similar to
relative_log2_differential_heat_tree.pdf but with some
modifications made with Inkskape to
enhance readability and to provide the expected format for publication
(.eps). But only the sizes of the trees and of the legends
have been modified.
The Figure 10 displays the comparison of Saccharomycotina genera present on the surface of D. suzukii and on the surface of endemic Drosophila species. Fungal genera not associated with sour rout were coded as non-Saccharomycotina.
# Load the data with a flag indicating whether it is an endemic fly or a Suzukii
dfSourRot <- LoadSourRot(flag_suzuki = T)$dfSourRot
df_correspondance <- data.frame(NameVar = LoadSourRot(flag_suzuki = T)$vars, type = LoadSourRot(flag_suzuki = T)$type)
# Creates a table of correspondance
dfExp3_Figure10 <- df_correspondance |>
group_by(type) |>
reframe(hits = rowSums(dfSourRot[NameVar], na.rm = TRUE), Species = dfSourRot$species, Genus = dfSourRot$genus, Family= dfSourRot$genus, Order = dfSourRot$order) |>
pivot_wider(names_from = type, values_from = hits) |>
left_join(y = df_col_Exp3, by = "Order") |>
group_by(Genus, Order, type) |>
reframe(across(.cols =where(is.double) , .fns = ~sum(.x, na.rm= T))) |>
mutate(type = if_else(grepl(pattern = "unknown", Order), true = NA, false = type)) |>
mutate(type = if_else(Order == "Saccharomycetales", true = "Saccharomycotina", false = type)) |>
group_by(Genus) |>
reframe(across(.cols =where(is.double) , .fns = ~sum(.x, na.rm= T)), Order = Order, type = type) |>
pivot_longer(cols = -c(Genus, Order, type), names_to = "Experiment", values_to = "hits") |>
filter(Experiment %in% c("Endemic", "Suzukii")) |>
group_by(Experiment) |>
reframe(Experiment = Experiment, Genus = Genus, Proportion = hits/sum(hits, na.rm = T), hits, Order, type) |>
left_join(y = df_col |> select(Genus, ColName, ColOrder), by = "Genus") |>
filter(hits >0)
We set up the colors, essentially using the same as the ones used in the previous Figures.
# takes the same color for Non-Saccharomycotina as in Figure 2
col_NonSaccharomycotina <- df_col |> filter(type == "Non-Saccharomycotina") |> select(type, Coltype) |> pull(Coltype) |> unique()
dfExp3_Figure10$ColName[dfExp3_Figure10$type == "Non-Saccharomycotina"] <- col_NonSaccharomycotina
# We use the same colors for the genera of Figure 1-6
# We thus create a table of correspondance
new_cols <- dfExp3_Figure10 |> filter(type == "Saccharomycotina" & is.na(ColName)) |> select(Genus, ColName) |> distinct()
# In this data set, there are 2 genera that are not present in the data sets of Table_S2 and S3.
# We assign the same colors to the genera that appeared in the Figure 1-6
# For the two extra genera, we take the last two colors (!)
# of the palette used for the Figure 1-6
new_cols$ColName <- colorspace::sequential_hcl(n =(cat_col["Saccharomycotina"]) + 2,palette = "Greens")[(cat_col["Saccharomycotina"] + 2):(cat_col["Saccharomycotina"] + 1)]
dfExp3_Figure10 <- left_join(dfExp3_Figure10, y = new_cols, by = c("Genus")) |>
mutate(ColName = coalesce(ColName.x, ColName.y), ColName.x = NULL, ColName.y = NULL) |>
mutate(Genus = case_when(type == "Non-Saccharomycotina" ~"Non-Saccharomycotina",
is.na(type) & grepl(pattern = "unknown", x = Genus)~NA,
.default = Genus)) |>
arrange(is.na(Genus), type, Genus)
We then rearrange the levels of the genera of the Figure 10 and compute the sample sizes.
lev_Genus <- unique(dfExp3_Figure10$Genus)
lev_Genus <- c(lev_Genus[lev_Genus == "Non-Saccharomycotina" & !is.na(lev_Genus)],
sort(lev_Genus[! lev_Genus %in% c("Fungus","Non-Saccharomycotina")]),
lev_Genus[lev_Genus== "Fungus"])
dfExp3_Figure10$Genus <- factor(dfExp3_Figure10$Genus, levels = lev_Genus)
dfExp3_Figure10_Size <- dfExp3_Figure10 |> group_by(Experiment) |> summarise(hits = sum(hits)) |> mutate(label = paste("n =", hits))
# just extract the colors to be used afterwards
df_col10 <- dfExp3_Figure10 |> select(Genus, ColName) |> distinct()
df_col10$ColName[df_col10$Genus == "Dipodascaceae"] <- "#004616"
We can now plot the Figure 10.
pl10 <- ggplot(data = dfExp3_Figure10 |> group_by(Experiment, Genus) |> summarise(Proportion = sum(Proportion)), mapping = aes(x = Experiment, fill = Genus, y = Proportion)) +
geom_bar(stat = "identity", col = "white", width = .3) +
scale_y_continuous(labels = scales::percent_format(), breaks = seq(0,1, by = 0.1), minor_breaks = seq(0.05, to = 0.95, by = 0.1)) +
labs( y = "Proportion", fill = "Taxa") +
scale_fill_manual( values =df_col10$ColName, breaks = df_col10$Genus) +
geom_text(data = dfExp3_Figure10_Size,
aes(x = Experiment, y = 1.05, label = label),
inherit.aes = FALSE,
vjust = -0.0, size = 3) +
theme_minimal() +
theme(
panel.grid.major = element_line(color = "grey70", linewidth = 0.6),
panel.grid.minor = element_line(color = "grey85", linewidth = 0.4),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
pl10